#include "config.h"

#include <time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/epoll.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <linux/fs.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <libgen.h>
#include <ctype.h>
#include <fcntl.h>
#include <libaio.h>
#include <limits.h>
#include <errno.h>
#include <sys/vfs.h>

#define DBG_SUBSYS S_LIBREPLICA

#include "cache.h"
#include "disk.h"
#include "sysy_lib.h"
#include "ynet_rpc.h"
#include "job_dock.h"
#include "net_global.h"
#include "diskmd.h"
#include "bh.h"
#include "tpool.h"
#include "lich_md.h"
#include "configure.h"
#include "dbg.h"
#include "replica.h"
#include "nodectl.h"
//#include "disk_aio.h"
#include "system.h"

/*
 * Structure of the super block, persisted in the first 1 MB header of
 * every disk device.
 *
 * The layout follows the ext4 on-disk super block. In this file the
 * structure is asserted to be exactly 1024 bytes, and the trailing
 * s_reserved area is used to store a ';'-separated "key=value" text
 * record (cluster, node, type, disk, pool, cache, cached, cset).
 */
struct ext4_super_block {
        /*00*/  __le32	s_vnodes_count;		/* Inodes count */
        __le32	s_blocks_count_lo;	/* Blocks count */
        __le32	s_r_blocks_count_lo;	/* Reserved blocks count */
        __le32	s_free_blocks_count_lo;	/* Free blocks count */
        /*10*/  __le32	s_free_vnodes_count;	/* Free vnodes count */
        __le32	s_first_data_block;	/* First Data Block */
        __le32	s_log_block_size;	/* Block size */
        __le32	s_log_cluster_size;	/* Allocation cluster size */
        /*20*/  __le32	s_blocks_per_group;	/* # Blocks per group */
        __le32	s_clusters_per_group;	/* # Clusters per group */
        __le32	s_vnodes_per_group;	/* # Inodes per group */
        __le32	s_mtime;		/* Mount time */
        /*30*/  __le32	s_wtime;		/* Write time */
        __le16	s_mnt_count;		/* Mount count */
        __le16	s_max_mnt_count;	/* Maximal mount count */
        __le16	s_magic;		/* Magic signature */
        __le16	s_state;		/* File system state */
        __le16	s_errors;		/* Behaviour when detecting errors */
        __le16	s_minor_rev_level;	/* minor revision level */
        /*40*/  __le32	s_lastcheck;		/* time of last check */
        __le32	s_checkinterval;	/* max. time between checks */
        __le32	s_creator_os;		/* OS */
        __le32	s_rev_level;		/* Revision level */
        /*50*/  __le16	s_def_resuid;		/* Default uid for reserved blocks */
        __le16	s_def_resgid;		/* Default gid for reserved blocks */
        /*
         * These fields are for EXT4_DYNAMIC_REV superblocks only.
         *
         * Note: the difference between the compatible feature set and
         * the incompatible feature set is that if there is a bit set
         * in the incompatible feature set that the kernel doesn't
         * know about, it should refuse to mount the filesystem.
         *
         * e2fsck's requirements are more strict; if it doesn't know
         * about a feature in either the compatible or incompatible
         * feature set, it must abort and not try to meddle with
         * things it doesn't understand...
         */
        __le32	s_first_ino;		/* First non-reserved vnode */
        __le16  s_vnode_size;		/* size of vnode structure */
        __le16	s_block_group_nr;	/* block group # of this superblock */
        __le32	s_feature_compat;	/* compatible feature set */
        /*60*/  __le32	s_feature_incompat;	/* incompatible feature set */
        __le32	s_feature_ro_compat;	/* readonly-compatible feature set */
        /*68*/  __u8	s_uuid[16];		/* 128-bit uuid for volume */
        /*78*/  char	s_volume_name[16];	/* volume name */
        /*88*/  char	s_last_mounted[64];	/* directory where last mounted */
        /*C8*/  __le32	s_algorithm_usage_bitmap; /* For compression */
        /*
         * Performance hints.  Directory preallocation should only
         * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on.
         */
        __u8	s_prealloc_blocks;	/* Nr of blocks to try to preallocate*/
        __u8	s_prealloc_dir_blocks;	/* Nr to preallocate for dirs */
        __le16	s_reserved_gdt_blocks;	/* Per group desc for online growth */
        /*
         * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set.
         */
        /*D0*/  __u8	s_journal_uuid[16];	/* uuid of journal superblock */
        /*E0*/  __le32	s_journal_inum;		/* vnode number of journal file */
        __le32	s_journal_dev;		/* device number of journal file */
        __le32	s_last_orphan;		/* start of list of vnodes to delete */
        __le32	s_hash_seed[4];		/* HTREE hash seed */
        __u8	s_def_hash_version;	/* Default hash version to use */
        __u8	s_jnl_backup_type;
        __le16  s_desc_size;		/* size of group descriptor */
        /*100*/ __le32	s_default_mount_opts;
        __le32	s_first_meta_bg;	/* First metablock block group */
        __le32	s_mkfs_time;		/* When the filesystem was created */
        __le32	s_jnl_blocks[17];	/* Backup of the journal vnode */
        /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */
        /*150*/ __le32	s_blocks_count_hi;	/* Blocks count */
        __le32	s_r_blocks_count_hi;	/* Reserved blocks count */
        __le32	s_free_blocks_count_hi;	/* Free blocks count */
        __le16	s_min_extra_isize;	/* All vnodes have at least # bytes */
        __le16	s_want_extra_isize; 	/* New vnodes should reserve # bytes */
        __le32	s_flags;		/* Miscellaneous flags */
        __le16  s_raid_stride;		/* RAID stride */
        __le16  s_mmp_update_interval;  /* # seconds to wait in MMP checking */
        __le64  s_mmp_block;            /* Block for multi-mount protection */
        __le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
        __u8	s_log_groups_per_flex;  /* FLEX_BG group size */
        __u8	s_reserved_char_pad;
        __le16  s_reserved_pad;
        __le64	s_kbytes_written;	/* nr of lifetime kilobytes written */
        __le32	s_snapshot_inum;	/* Inode number of active snapshot */
        __le32	s_snapshot_id;		/* sequential ID of active snapshot */
        __le64	s_snapshot_r_blocks_count; /* reserved blocks for active
                                              snapshot's future use */
        __le32	s_snapshot_list;	/* vnode number of the head of the
                                           on-disk snapshot list */
#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count)
        __le32	s_error_count;		/* number of fs errors */
        __le32	s_first_error_time;	/* first time an error happened */
        __le32	s_first_error_ino;	/* vnode involved in first error */
        __le64	s_first_error_block;	/* block involved of first error */
        __u8	s_first_error_func[32];	/* function where the error happened */
        __le32	s_first_error_line;	/* line number where error happened */
        __le32	s_last_error_time;	/* most recent time of an error */
        __le32	s_last_error_ino;	/* vnode involved in last error */
        __le32	s_last_error_line;	/* line number where error happened */
        __le64	s_last_error_block;	/* block involved of last error */
        __u8	s_last_error_func[32];	/* function where the error happened */
#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts)
        __u8	s_mount_opts[64];
        __le32	s_usr_quota_inum;	/* vnode for tracking user quota */
        __le32	s_grp_quota_inum;	/* vnode for tracking group quota */
        __le32	s_overhead_clusters;	/* overhead blocks/clusters in fs */
        __le32  s_reserved[109];        /* Padding to the end of the block;
                                           this file stores a "key=value;"
                                           text record here */
};


/* Forward declarations of helpers that are defined further down in this file. */
int disk_load_bitmap(disk_t *disk, const char *home);
int disk_create_bitmap(const disk_t *disk, const char *home, uint64_t disk_size);
int disk_load_tier(disk_t *disk, const char *home);

/**
 * Compute the SHA-1 of the chunk at @loc and print it into @md.
 *
 * NOTE: this is a stub. The unconditional assertion below fires before any
 * hashing happens, and the descriptor handed to _sha1_file() is never opened
 * (it stays -1).
 *
 * @param loc disk location (disk id + index) of the chunk
 * @param md  output buffer receiving the printable digest
 * @return 0 on success, otherwise the error returned by _sha1_file()
 */
int disk_sha1(const diskloc_t *loc, char *md)
{
        unsigned char digest[MAX_BUF_LEN];
        uint64_t start = 0;
        int chunk_fd = -1;
        int ret;

#if ENABLE_BMAP_DEBUG
        diskmd_exists(loc, 1);
#endif

        /* not implemented yet */
        YASSERT(0 && "need achieve");

        ret = _sha1_file(chunk_fd, start, LICH_CHUNK_SPLIT, digest);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        _sha1_print(md, digest);
        DINFO("location (%u, %u) sha1 %s\n", loc->diskid, loc->idx, md);

        return 0;
err_ret:
        return ret;
}

/**
 * Tell whether @disk can be used.
 *
 * @return 1 when @disk is non-NULL and its __DISK_OFFLINE__ status bit is
 *         clear, 0 otherwise
 */
int disk_avaiable(disk_t *disk)
{
        if (!disk)
                return 0;

        return (disk->status & __DISK_OFFLINE__) ? 0 : 1;
}

/**
 * Tell whether new space may be allocated on @disk.
 *
 * @return 1 when @disk is non-NULL and neither the __DISK_OFFLINE__ nor the
 *         __DISK_DELETING__ status bit is set, 0 otherwise
 */
int disk_allocable(disk_t *disk)
{
        if (!disk)
                return 0;

        if (disk->status & __DISK_OFFLINE__)
                return 0;

        if (disk->status & __DISK_DELETING__)
                return 0;

        return 1;
}

/**
 * Resolve @path through any chain of symbolic links.
 *
 * On success @path holds the canonical path and @stbuf the lstat() result of
 * the final, non-link object.
 *
 * @param path  in/out path buffer; it is handed to realpath() as the output
 *              buffer, so it must be able to hold PATH_MAX bytes
 * @param stbuf receives the lstat() information of the resolved object
 * @return 0 on success, otherwise an errno value (ENAMETOOLONG when a
 *         relative link target does not fit next to its directory)
 */
int disk_get_realpath(char *path, struct stat* stbuf)
{
        int ret;
        ssize_t len;
        size_t dirlen;
        char buf[MAX_PATH_LEN], joined[MAX_PATH_LEN];
        const char *target;
        char *slash;

        ret = lstat(path, stbuf);
        if (ret < 0) {
                /* a missing path is reported without logging */
                ret = errno;
                goto err_ret;
        }

        while (S_ISLNK(stbuf->st_mode)) {
                /* readlink() does not NUL-terminate: keep one byte for it */
                len = readlink(path, buf, sizeof(buf) - 1);
                if (len < 0) {
                        ret = errno;
                        GOTO(err_ret, ret);
                }
                buf[len] = '\0';

                target = buf;
                slash = strrchr(path, '/');
                if (buf[0] != '/' && slash) {
                        /* relative target: interpret it against the
                         * directory that contains the link */
                        dirlen = (size_t)(slash - path) + 1;
                        if (dirlen + (size_t)len >= sizeof(joined)) {
                                ret = ENAMETOOLONG;
                                GOTO(err_ret, ret);
                        }

                        memcpy(joined, path, dirlen);
                        memcpy(joined + dirlen, buf, (size_t)len + 1);
                        target = joined;
                }

                /* on failure the content of path is unspecified, so stop
                 * here instead of calling lstat() on it */
                if (realpath(target, path) == NULL) {
                        ret = errno;
                        GOTO(err_ret, ret);
                }

                ret = lstat(path, stbuf);
                if (ret < 0) {
                        ret = errno;
                        GOTO(err_ret, ret);
                }
        }

        return 0;
err_ret:
        return ret;
}

/**
 * Query the size in bytes of the block device at @path.
 *
 * @param path device node to open read-only
 * @param size receives the result of the BLKGETSIZE64 ioctl
 * @return 0 on success, otherwise the errno of the failing open()/ioctl()
 */
int disk_getblocksize(const char *path, uint64_t *size)
{
        int ret;
        int dev;

        dev = open(path, O_RDONLY);
        if (dev < 0) {
                ret = errno;
                GOTO(err_ret, ret);
        }

        if (ioctl(dev, BLKGETSIZE64, size) < 0) {
                ret = errno;
                GOTO(err_fd, ret);
        }

        close(dev);

        return 0;
err_fd:
        close(dev);
err_ret:
        return ret;
}

/**
 * Fill @diskinfo for @disk and persist it at DISKINFO_OFFSET.
 *
 * The record gets the disk index, the local node id, a freshly generated
 * disk uuid and the cluster uuid; its crc covers everything after the
 * leading 32-bit word. The record is written inside a zero-padded buffer
 * that spans DISKINFO_OFFSET..DISK_WRITEABLE_OFFSET.
 *
 * NOTE(review): assumes sizeof(diskinfo_t) fits into that span and that
 * diskinfo->cluster holds at least MAX_NAME_LEN bytes - confirm.
 *
 * @return 0 on success, EIO when the write is short or fails
 */
static int disk_set_diskinfo(const disk_t *disk, diskinfo_t *diskinfo)
{
        int ret;
        char buf[DISK_WRITEABLE_OFFSET - DISKINFO_OFFSET] = {0};

        diskinfo->idx = disk->idx;
        diskinfo->nid = *net_getnid();
        uuid_generate(diskinfo->diskid);
        memset(diskinfo->cluster, 0x0, MAX_NAME_LEN);
        /* bounded copy: never write past the MAX_NAME_LEN bytes cleared above */
        snprintf(diskinfo->cluster, MAX_NAME_LEN, "%s", gloconf.uuid);
        diskinfo->crc = crc32_sum((void *)diskinfo + sizeof(uint32_t), sizeof(*diskinfo) - sizeof(uint32_t));

        memcpy(buf, diskinfo, sizeof(diskinfo_t));

        ret = disk->dop->io_pwrite(disk, buf, sizeof(buf), DISKINFO_OFFSET);
        if (ret != sizeof(buf)) {
                /* report the length that was actually requested */
                DWARN("the disk type is %d, return %d need %lu\n", disk->disk_type, ret, sizeof(buf));
                ret = EIO;
                GOTO(err_ret, ret);
        }

        return 0;
err_ret:
        return ret;
}

/**
 * Build the ext4-style super block for @disk and persist it.
 *
 * Three writes are performed: the super block at SUPER_BLOCK_OFFSET, an
 * otherwise empty MBR carrying the 0x55 0xaa signature at MBR_OFFSET, and a
 * copy of the super block in "<home>/block/<idx>.block".
 *
 * @param home      node home directory
 * @param pool_name pool recorded in the reserved text area
 * @param disk      target disk
 * @param diskinfo  source of the volume uuid
 * @return 0 on success; EIO on a short/failed device write; ENAMETOOLONG
 *         when the reserved text record does not fit into s_reserved;
 *         otherwise the error of the failing helper
 */
static int disk_set_superblock(const char *home, const char *pool_name, const disk_t *disk, diskinfo_t *diskinfo)
{
        int ret, len;
        struct ext4_super_block super_block;
        char hostname[MAX_NAME_LEN], block[MAX_PATH_LEN];
        char mbr[MBR_SIZE];

        YASSERT(sizeof(super_block) == 1024);

        ret = net_gethostname(hostname, MAX_NAME_LEN);
        if (unlikely(ret)) {
                if (ret == ECONNREFUSED)
                        strcpy(hostname, "N/A");
                else
                        GOTO(err_ret, ret);
        }

        memset(&super_block, 0, sizeof(super_block));

        super_block.s_vnodes_count = 3276800;
        super_block.s_blocks_count_lo = 13107200;
        super_block.s_blocks_per_group = 32768;
        super_block.s_vnodes_per_group = 8192;
        super_block.s_magic = 0xef53;
        super_block.s_state = 1; /* not requisite */
        super_block.s_errors = 1; /* not requisite */
        memcpy(super_block.s_uuid, diskinfo->diskid, sizeof(super_block.s_uuid));

        /* the label is informational; truncation is acceptable, overflow is not */
        snprintf(super_block.s_volume_name, sizeof(super_block.s_volume_name),
                 "lich-disk%d", disk->idx);

        /* a truncated record would silently drop trailing keys, so refuse it */
        len = snprintf((char *)super_block.s_reserved, sizeof(super_block.s_reserved),
                       "cluster=%s;node=%s;type=data;disk=%d;pool=%s;cache=%d;cached=0;cset=%s;",
                       gloconf.uuid, hostname, disk->idx, pool_name, disk->cache, disk->cset_uuid);
        if (len < 0 || (size_t)len >= sizeof(super_block.s_reserved)) {
                ret = ENAMETOOLONG;
                GOTO(err_ret, ret);
        }

        DINFO("disk %d reserved %s\n", disk->idx, (char *)super_block.s_reserved);

        ret = disk->dop->io_pwrite(disk, (char *)&super_block, sizeof(super_block), SUPER_BLOCK_OFFSET);
        if (ret != sizeof(super_block)) {
                ret = EIO;
                GOTO(err_ret, ret);
        }

        memset(mbr, 0x0, sizeof(mbr));
        mbr[510] = 0x55;
        mbr[511] = 0xaa;

        ret = disk->dop->io_pwrite(disk, mbr, sizeof(mbr), MBR_OFFSET);
        if (ret != sizeof(mbr)) {
                ret = EIO;
                GOTO(err_ret, ret);
        }

        snprintf(block, MAX_PATH_LEN, "%s/block/%d.block", home, disk->idx);

        ret = _set_value_off(block, (void *)&super_block, sizeof(super_block), SUPER_BLOCK_OFFSET, O_CREAT | O_TRUNC);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        return 0;
err_ret:
        return ret;
}

/**
 * Read the super block of @disk from SUPER_BLOCK_OFFSET into @super_block.
 *
 * @return 0 when the full structure was read, EIO otherwise
 */
static int disk_get_superblock(const disk_t *disk, struct ext4_super_block *super_block)
{
        int ret;

        ret = disk->dop->io_pread(disk, (char *)super_block, sizeof(*super_block), SUPER_BLOCK_OFFSET);
        if (ret == sizeof(*super_block))
                return 0;

        /* short read or I/O failure */
        ret = EIO;
        GOTO(err_ret, ret);
err_ret:
        return ret;
}

/**
 * Wipe the super block of @disk by overwriting it with zeroes at
 * SUPER_BLOCK_OFFSET.
 *
 * @return 0 when the whole structure was written, EIO otherwise
 */
int disk_erasure_superblock(disk_t *disk)
{
        int ret;
        struct ext4_super_block blank;

        memset(&blank, 0, sizeof(blank));

        ret = disk->dop->io_pwrite(disk, (char *)&blank, sizeof(blank), SUPER_BLOCK_OFFSET);
        if (ret == sizeof(blank))
                return 0;

        /* short write or I/O failure */
        ret = EIO;
        GOTO(err_ret, ret);
err_ret:
        return ret;
}

/**
 * Initialise the on-disk metadata of @disk: first the diskinfo record,
 * then the super block.
 *
 * @param home      node home directory
 * @param pool_name pool the disk is assigned to
 * @param disk      target disk
 * @param diskinfo  record that is filled in and then used for the super block
 * @return 0 on success, otherwise the error of the step that failed
 */
int disk_setinfo(const char *home, const char *pool_name, const disk_t *disk, diskinfo_t *diskinfo)
{
        int ret;

        DINFO("home %s pool %s disk %d super %lu\n", home, pool_name, disk->idx, sizeof(struct ext4_super_block));

        ret = disk_set_diskinfo(disk, diskinfo);
        if (ret == 0) {
                /* the super block is only written once the diskinfo is in place */
                ret = disk_set_superblock(home, pool_name, disk, diskinfo);
        }

        if (unlikely(ret))
                GOTO(err_ret, ret);

        return 0;
err_ret:
        return ret;
}

/**
 * Extract the value of the "pool=" key from the reserved text area of
 * @super_block.
 *
 * When the key is missing, POOL_DEFAULT is returned instead and a warning
 * is logged.
 *
 * @param super_block super block previously read from disk
 * @param pool        output buffer; NOTE(review): its capacity is not
 *                    visible here, the caller must size it for the longest
 *                    possible pool name
 */
static void super_block_decode_pool(struct ext4_super_block *super_block, char *pool)
{
        /* one extra byte: the on-disk area is not guaranteed to carry a NUL */
        char tmp[sizeof(super_block->s_reserved) + 1];
        char *p, *_pool = NULL;

        memcpy(tmp, super_block->s_reserved, sizeof(super_block->s_reserved));
        tmp[sizeof(super_block->s_reserved)] = '\0';

        p = strstr(tmp, "pool=");
        if (p) {
                _pool = p + strlen("pool=");
                /* the value ends at the next ';' (or at the end of the record) */
                p = strchr(_pool, ';');
                if (p) {
                        *p = '\0';
                }
        }

        if (_pool)
                strcpy(pool, _pool);
        else {
                DWARN("super_block not record pool information, set %s\n", POOL_DEFAULT);
                strcpy(pool, POOL_DEFAULT);
        }
}

/**
 * Decide the base offset for @disk and store it in disk->disk_base_offset.
 *
 * The offset is BCACHE_SUPERBLOCK_LEN only when @path does not contain
 * "bcache", the disk type is __DISK_TYPE_NORMAL_DISK__ and a cache-set uuid
 * is recorded (non-empty and not the literal "None"). In every other case
 * it is 0.
 *
 * @return always 0
 */
int disk_get_base_offset(const char *path, disk_t *disk)
{
        disk->disk_base_offset = 0;

        if (strstr(path, "bcache") == NULL
            && disk->disk_type == __DISK_TYPE_NORMAL_DISK__) {
                if (strlen(disk->cset_uuid) != 0 && strcmp(disk->cset_uuid, "None") != 0)
                        disk->disk_base_offset = BCACHE_SUPERBLOCK_LEN;
        }

        DINFO("disk %s base offset %lu\n", path, disk->disk_base_offset);
        return 0;
}

//disk->getpool
int disk_getpool(const disk_t *disk, char *name)
{
        int ret;
        struct ext4_super_block super_block;

        ret = disk_get_superblock(disk, &super_block);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        super_block_decode_pool(&super_block, name);

        return 0;
err_ret:
        return ret;
}

/**
 * Remove the "<home>/disk/<idx>.disk" entry of @disk.
 *
 * Best effort: the result of unlink() is intentionally not reported.
 */
void disk_unlink(const char *home, const disk_t *disk)
{
        char path[MAX_PATH_LEN];

        /* bounded: an over-long home must not overflow the stack buffer */
        snprintf(path, sizeof(path), "%s/disk/%d.disk", home, disk->idx);
        unlink(path);
}
/**
 * Read the diskinfo record of @disk from DISKINFO_OFFSET and verify its crc.
 *
 * Both failure cases first remove the "<home>/disk/<idx>.disk" entry and
 * then leave through EXIT()/YASSERT() rather than returning an error
 * (EXIT presumably terminates the process - confirm against its definition).
 *
 * @param home     node home directory
 * @param disk     disk to read from
 * @param diskinfo receives the record
 * @return 0 on success
 */
int disk_getinfo(const char *home, const disk_t *disk, diskinfo_t *diskinfo)
{
        int ret;
        uint32_t crc;
        char buf[DISK_WRITEABLE_OFFSET - DISKINFO_OFFSET] = {0};

        ret = disk->dop->io_pread(disk, buf, sizeof(buf), DISKINFO_OFFSET);
        if (ret != sizeof(buf)) {
                /* ret is a byte count (or failure code), not an errno, so it
                 * is logged as a signed number next to the requested length */
                DWARN("read disk[%u], ret %d need %lu\n", disk->idx, ret, (unsigned long)sizeof(buf));
                disk_unlink(home, disk);

                EXIT(EAGAIN);
        }

        memcpy(diskinfo, buf, sizeof(diskinfo_t));

        /* the crc covers everything after the leading 32-bit word */
        crc = crc32_sum((void *)diskinfo + sizeof(uint32_t), sizeof(*diskinfo) - sizeof(uint32_t));
        if (crc != diskinfo->crc) {
                DWARN("read disk[%u], crc %x %x\n", disk->idx, crc, diskinfo->crc);
                disk_unlink(home, disk);
                YASSERT(0);
                EXIT(EIO);
        }

        return 0;
}


/**
 * Allocate a disk object for slot @idx and initialise its delete list,
 * delete spin lock and rwlock.
 *
 * NOTE(review): when one of the two lock initialisations fails, the object
 * obtained from ymalloc() is not released before returning - looks like a
 * leak on those paths.
 *
 * @param idx   disk index stored in the new object
 * @param _disk receives the new object on success
 * @return 0 on success, otherwise the error of the failing step
 */
int disk_create(int idx, disk_t **_disk)
{
        disk_t *created;
        int ret;

        ret = ymalloc((void **)&created, sizeof(disk_t));
        if (unlikely(ret))
                GOTO(err_ret, ret);

        created->idx = idx;
        INIT_LIST_HEAD(&created->delete_list);

        ret = sy_spin_init(&created->delete_lock);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        ret = sy_rwlock_init(&created->lock, "disk.lock");
        if (unlikely(ret))
                GOTO(err_ret, ret);

        created->private = 0;

        /* hand the fully initialised object to the caller */
        *_disk = created;

        return 0;
err_ret:
        return ret;
}

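/* How disks are attached:
 * 1. SATA disks join the cluster through a symlink to the disk itself;
 *    the super block can be read and written directly at the disk head.
 * 2. SPDK disks join a new cluster through a symlink to an empty file
 *    whose name is the PCI address.
 * 3. Disks used in testing are plain files; the disk size is the file size.
 * 4. ram.
 *
 * Disk load cases:
 * 1. Normal load: the disk joins the cluster and is usable.
 * 2. Damaged disk: the disk still has to be loaded into the cluster; only
 *    its pool membership and its bitmap are loaded (so that the data that
 *    lived on the disk can be recovered).
 * 3. Damaged bitmap: the disk cannot be loaded at all.
 */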

/**
 * Select the disk operation table for disk @idx.
 *
 * The "<home>/disk/<idx>.disk" entry is resolved through its symlinks and
 * the backend is chosen from the resolved object:
 *  - regular file whose path contains "pci": SPDK ops when built with SPDK,
 *    otherwise NVMe ops if gloconf.nvme is set;
 *  - regular file whose path contains "ram": RAM disk ops;
 *  - any other regular file, or a block device: normal disk ops.
 *
 * @param home node home directory
 * @param idx  disk index, must be below DISK_MAX
 * @param _dop receives the operation table on success
 * @return 0 on success; the error of disk_get_realpath(); EINVAL when no
 *         backend could be selected
 */
static int __disk_getop(const char *home, int idx, struct disk_op_t **_dop)
{
        int ret;
        struct stat stbuf;
        struct disk_op_t *dop = NULL;
        char path[PATH_MAX];

        YASSERT(idx < DISK_MAX);

        snprintf(path, sizeof(path), "%s/disk/%d.disk", home, idx);

        ret = disk_get_realpath(path, &stbuf);
        if (unlikely(ret)) {
                GOTO(err_ret, ret);
        }

        if (S_ISREG(stbuf.st_mode)) {
                if (strstr(path, "pci")) {
                        DINFO("loading nvme disk\r\n");
#if defined SPDK
                        dop = get_spdk_disk_ops();
#else
                        /* without nvme support dop stays NULL and is
                         * rejected below */
                        if(gloconf.nvme)
                                dop = get_nvme_disk_ops();
#endif
                } else if (strstr(path, "ram")) {
                        dop = get_ram_disk_ops();
                } else {
                        dop = get_normal_disk_ops();
                }
        } else if (S_ISBLK(stbuf.st_mode)) {
                dop = get_normal_disk_ops();
        } else {
                YASSERT(0 && "why?");
        }

        /* never hand an unset operation table to the caller */
        if (dop == NULL) {
                ret = EINVAL;
                GOTO(err_ret, ret);
        }

        *_dop = dop;

        return 0;
err_ret:
        return ret;
}


/**
 * Load @disk so that it can be used by the node.
 *
 * Normal path: pick the backend operations, open the disk, record its size,
 * map its bitmap (creating the bitmap first when it does not exist yet),
 * load the tier and run the backend probe check.
 *
 * Failed-disk path: when the ".disk" entry is missing (ENOENT) and the disk
 * is not new, but "<home>/block/<idx>.block" is accessible, only the bitmap
 * is loaded and *bad is set to 1.
 *
 * @param disk disk object, disk->idx must be below DISK_MAX
 * @param home node home directory
 * @param new  non-zero when the disk is being added for the first time
 * @param pool pool name buffer passed through to the backend open/probe
 * @param bad  set to 1 when the disk was loaded through the failed-disk
 *             path, 0 otherwise
 * @return 0 on success, otherwise the error of the failing step
 */
int disk_load(disk_t *disk, const char *home, int new, char *pool, int *bad)
{
        int ret, retry = 0;
        uint64_t disk_size;
        char path[PATH_MAX];

        /* validate before the first dereference of disk */
        YASSERT(disk && disk->idx < DISK_MAX);

        DINFO("disk disk %d home %s new %d\n", disk->idx, home, new);

        *bad = 0;

        snprintf(path, sizeof(path), "%s/disk/%d.disk", home, disk->idx);

        ret = __disk_getop(home, disk->idx, &disk->dop);
        if (unlikely(ret)) {
                if (ret == ENOENT && !new) {
                        snprintf(path, sizeof(path), "%s/block/%d.block", home, disk->idx);
                        ret = path_access(path);
                        if (unlikely(ret)) {
                                GOTO(err_ret, ret);
                        } else {
                                /* the disk has failed: its bitmap still has to be loaded */
                                ret = disk_load_bitmap(disk, home);
                                if (unlikely(ret))
                                        GOTO(err_ret, ret);

                                *bad = 1;

                                DWARN("disk disk %u path %s bad %d\n", disk->idx, path, *bad);
                                goto out;
                        }
                }
                GOTO(err_ret, ret);
        }

        YASSERT(disk->dop && disk->dop->open);

        ret = disk->dop->open(disk, home, pool, &disk_size);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        /* pool validation is currently disabled */
        if (0) {
                int found = 0;
                ret = system_pool_find(pool, &found);
                if (unlikely(ret)) {
                        DWARN("pool %s is invalid, ret %d\n", pool, ret);
                        GOTO(err_ret, ret);
                }

                if (!found) {
                        DWARN("pool %s is invalid, ret %d\n", pool, ret);
                        GOTO(err_ret, ret);
                }
        }

        disk->real_size = disk_size;
        disk->size = _align_down(disk_size, PAGE_SIZE);

        YASSERT(disk->size >= LICH_CHUNK_SPLIT);

retry:
        ret = disk_load_bitmap(disk, home);
        if (unlikely(ret)) {

                /* the previous attempt to add this disk did not finish */
                if (likely(ret == ENOENT && retry < 2)) {
                        DWARN("load disk [%u] but bitmap not exist, try create!\n", disk->idx);
                        ret = disk_create_bitmap(disk, home, disk->size);
                        if (unlikely(ret)) {
                                GOTO(err_ret, ret);
                        }

                        retry++;
                        goto retry;
                } else {
                        GOTO(err_ret, ret);
                }
        }

        ret = disk_load_tier(disk, home);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        ret = disk->dop->probe_check(disk, home, pool);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        DINFO("disk load disk[%u] %s tier %d size %ld\n", disk->idx, path, disk->tier, disk->size);
out:
        return 0;
err_ret:
        return ret;
}

/**
 * Report the size of disk @idx as seen by its backend.
 *
 * @param home       node home directory
 * @param idx        disk index
 * @param _disk_size receives the size on success
 * @return 0 on success, otherwise the error from backend selection or from
 *         the backend's get_size()
 */
int disk_getsize(const char *home, int idx, uint64_t *_disk_size)
{
        struct disk_op_t *ops;
        uint64_t bytes;
        int ret;

        ret = __disk_getop(home, idx, &ops);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        ret = ops->get_size(home, idx, &bytes);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        DINFO("disk[%u], size %ju\n", idx, bytes);

        *_disk_size = bytes;

        return 0;
err_ret:
        return ret;
}


/**
 * Release the bitmap mapping of @disk and close its backing descriptor.
 *
 * The mapping is always unmapped; the descriptor is closed (and reset to
 * -1) only when it is not already -1.
 */
void disk_close_bitmap(disk_t *disk)
{
        munmap(disk->bmap.bits, disk->map_size);

        if (disk->map_fd == -1)
                return;

        close(disk->map_fd);
        disk->map_fd = -1;
}

/**
 * Build the bitmap file path of disk @idx into @path.
 *
 * With gloconf.bmap_mem set, the bitmap lives under /dev/shm/lich4/bitmap
 * and the path is passed to path_validate(); @home is not used in that
 * mode. Otherwise the path is "<home>/bitmap/<idx>.bitmap".
 *
 * @param home node home directory (only read when bmap_mem is off)
 * @param idx  disk index
 * @param path output buffer, sized by the caller
 */
void __disk_bitmap(const char *home, int idx, char *path)
{
        int ret;

        if (!gloconf.bmap_mem) {
                sprintf(path, "%s/bitmap/%d.bitmap", home, idx);
                return;
        }

        sprintf(path, "/dev/shm/lich4/bitmap/%d.bitmap", idx);

        ret = path_validate(path, YLIB_NOTDIR, YLIB_DIRCREATE);
        if (unlikely(ret))
                UNIMPLEMENTED(__DUMP__);
}

/**
 * Clean up whatever the disk entry @path points to.
 *
 * If the resolved target's base name starts with "pci_", the target file is
 * unlinked; any other target (device node or plain file) gets its first
 * 1 MiB overwritten with zeroes. A target for which _file_exist() reports
 * non-zero is left alone.
 *
 * @param path the "<home>/disk/<idx>.disk" entry
 * @return 0 on success; ENOENT when @path itself is missing; the errno of
 *         realpath(); or the error from _file_writezero()
 */
static int __nvme_unload(const char *path)
{
        char resolved[MAX_PATH_LEN];
        int ret = 0;

        if (_file_exist(path) != 0) {
                ret = ENOENT;
                DWARN("path %s\n", path);
                GOTO(err_ret, ret);
        }

        if (realpath(path, resolved) == NULL) {
                ret = errno;
                GOTO(err_ret, ret);
        }

        DINFO("path %s realpath %s\n", path, resolved);

        /* nothing to clean when the target is not there */
        if (_file_exist(resolved) != 0)
                return 0;

        if (strncmp(basename(resolved), "pci_", 4) == 0) {
                _unlink(resolved, "nvme");
                return 0;
        }

        // include /dev/ and file
        DINFO("writezero rpath %s\n", resolved);

        ret = _file_writezero(resolved, 0, 1024 * 1024);
        if (unlikely(ret)) {
                GOTO(err_ret, ret);
        }

        return 0;
err_ret:
        return ret;
}

/**
 * Detach @disk from the node and remove every per-disk file.
 *
 * The bitmap mapping is closed, the backend (if any) is closed and the disk
 * object is released with yfree(), so the caller must not use @disk
 * afterwards. Then the disk entry's target is cleaned up and the disk,
 * block, info, bitmap, tier and speed files plus the nodectl
 * "diskstat/<pool>/<idx>.stat" entry are removed. All removals are best
 * effort.
 *
 * @todo clear disk head
 * @todo if cannot locate device (include /dev/ and pci_)
 *
 * @param disk disk object to unload (freed by this call)
 * @param home node home directory
 * @param pool pool the disk belonged to
 */
void disk_unload(disk_t *disk, const char *home, const char *pool)
{
        int idx = disk->idx;
        char path[MAX_PATH_LEN];

        /* every path below is built with a bounded format so that an
         * over-long home or pool cannot overflow the stack buffer */
        snprintf(path, sizeof(path), "%s/disk/%d.disk", home, idx);

        DINFO("disk %d op %p %p path %s exists %d\n", disk->idx, disk, disk->dop, path, _file_exist(path) == 0);

        disk_close_bitmap(disk);

        /* for a bad disk the disk type cannot be determined, so there is
         * no backend operation table to close */
        if (disk->dop) {
                disk->dop->close(disk);
        }

        yfree((void **)&disk);

        // TODO if NVMe, need to remove the pci file
        __nvme_unload(path);

        _unlink(path, "disk_unload");

        snprintf(path, sizeof(path), "%s/block/%d.block", home, idx);
        _unlink(path, "disk_unload");

        snprintf(path, sizeof(path), "%s/info/%d.info", home, idx);
        _unlink(path, "disk_unload");

        __disk_bitmap(home, idx, path);
        _unlink(path, "disk_unload");

        snprintf(path, sizeof(path), "%s/tier/%d.tier", home, idx);
        _unlink(path, "disk_unload");

        snprintf(path, sizeof(path), "%s/speed/%d.speed", home, idx);
        _unlink(path, "disk_unload");

        snprintf(path, sizeof(path), "diskstat/%s/%u.stat", pool, idx);
        nodectl_unlink(path);
}

/**
 * Release a disk object through yfree().
 *
 * @param disk address of the pointer that refers to the object
 */
void disk_free(disk_t **disk)
{
        void **slot = (void **)disk;

        yfree(slot);
}

/*
 * Disabled code (never compiled because of the "#if 0" guard).
 *
 * The function below would initialise a new disk: write the on-disk
 * metadata via disk_setinfo(), store the diskinfo record in
 * "<home>/info/<idx>.info", mark bit 0 of the bitmap as used and, unless the
 * bitmap lives in memory, fsync the bitmap file.
 */
#if 0
/*disk->initnew*/
int disk_init_newdisk(disk_t *disk, const char *home, const char *pool)
{
        int ret;
        diskinfo_t diskinfo;
        char path[MAX_PATH_LEN];

        ret = disk_setinfo(home, pool, disk, &diskinfo);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        snprintf(path, MAX_PATH_LEN, "%s/info/%d.info", home, disk->idx);

        ret = _set_value(path, (void *)&diskinfo, sizeof(diskinfo), O_CREAT | O_TRUNC);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        ret = bmap_set(&disk->bmap, 0);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        if (gloconf.bmap_mem) {
                //need not sync
        } else {
                ret = fsync(disk->map_fd);
                if (ret)
                        UNIMPLEMENTED(__DUMP__);
        }

        return 0;
err_ret:
        return ret;
}
#endif

/**
 * Open the bitmap file of @disk and map it into memory.
 *
 * The whole file is mapped shared, read/write and locked; the mapping is
 * handed to bmap_load() and the descriptor and mapping size are remembered
 * in the disk object.
 *
 * @param disk disk object, disk->idx must be below DISK_MAX
 * @param home node home directory
 * @return 0 on success; ENOENT (not logged) when the bitmap file does not
 *         exist; otherwise the errno of the failing open/fstat/mmap
 */
int disk_load_bitmap(disk_t *disk, const char *home)
{
        int ret, fd;
        char path[MAX_PATH_LEN];
        struct stat stbuf;
        void *addr;

        YASSERT(disk->idx < DISK_MAX);

        __disk_bitmap(home, disk->idx, path);

        if (gloconf.bmap_mem) {
                fd = open(path, O_RDWR);
        } else {
                /* O_SYNC is an open flag: it has to be OR-ed into the second
                 * argument; as a third argument it is taken as a creation
                 * mode and ignored because O_CREAT is not set */
                fd = open(path, O_RDWR | O_SYNC);
        }
        if (fd < 0) {
                ret = errno;
                /* a missing bitmap is an expected case for the caller, so it
                 * is reported without logging */
                if (ret == ENOENT)
                        goto err_ret;
                else
                        GOTO(err_ret, ret);
        }

        ret = fstat(fd, &stbuf);
        if (ret < 0) {
                ret = errno;
                GOTO(err_fd, ret);
        }

        addr = mmap(0, stbuf.st_size, PROT_WRITE | PROT_READ,
                        MAP_LOCKED | MAP_SHARED, fd, 0);
        if (addr == MAP_FAILED) {
                ret = errno;
                GOTO(err_fd, ret);
        }

        bmap_load(&disk->bmap, addr, stbuf.st_size);
        disk->map_fd = fd;
        disk->map_size = stbuf.st_size;

        return 0;
err_fd:
        close(fd);
err_ret:
        return ret;
}

int disk_create_bitmap(const disk_t *disk, const char *home, uint64_t disk_size)
{
        int ret, fd;
        uint64_t size;
        char path[MAX_PATH_LEN], tmp[MAX_PATH_LEN];
        void *ptr;

        size = (disk_size / LICH_CHUNK_SPLIT) / CHAR_BIT;

        __disk_bitmap(home, disk->idx, path);
        sprintf(tmp, "%s.tmp", path);

        fd = open(tmp, O_RDWR | O_CREAT | O_TRUNC, 0644);
        if (fd < 0) {
                ret = errno;
                if (ret == EEXIST) {
                        DERROR("create %s exist\n", tmp);
                        YASSERT(0 && "why?");
                }
                GOTO(err_ret, ret);
        }

        ret = ymalloc((void **)&ptr, size);
        if (ret)
                GOTO(err_fd, ret);

        memset(ptr, 0x0, size);

        ret = _pwrite(fd, ptr, size, 0);
        if (ret < 0) {
                ret = -ret;
                GOTO(err_free, ret);
        }

        yfree((void **)&ptr);
        fsync(fd);
        close(fd);

        ret = rename(tmp, path);
        if (ret < 0) {
                ret = errno;
                GOTO(err_ret, ret);
        }

        return 0;
err_free:
        yfree((void **)&ptr);
err_fd:
        close(fd);
err_ret:
        return ret;
}

int disk_create_bitmap_with(int diskid, uint64_t size, const void *ptr)
{
        int ret, fd;
        char path[MAX_PATH_LEN], tmp[MAX_PATH_LEN];

        __disk_bitmap(NULL, diskid, path);
        sprintf(tmp, "%s.tmp", path);

        struct stat stbuf;
        ret = stat(path, &stbuf);
        YASSERT(ret != 0);
        
        fd = open(tmp, O_RDWR | O_CREAT | O_TRUNC, 0644);
        if (fd < 0) {
                ret = errno;
                if (ret == EEXIST) {
                        DERROR("create %s exist\n", tmp);
                        YASSERT(0 && "why?");
                }
                GOTO(err_ret, ret);
        }

        ret = _pwrite(fd, ptr, size, 0);
        if (ret < 0) {
                ret = -ret;
                GOTO(err_fd, ret);
        }

        fsync(fd);
        close(fd);

        ret = rename(tmp, path);
        if (ret < 0) {
                ret = errno;
                GOTO(err_ret, ret);
        }

        return 0;
err_fd:
        close(fd);
err_ret:
        return ret;
}

/**
 * Load the tier of @disk from "<home>/tier/<idx>.tier".
 *
 * The file content is parsed with atoi(). A missing file is not an error:
 * the tier is then set to -1.
 *
 * @return 0 on success, otherwise the (positive) error from _get_text()
 */
int disk_load_tier(disk_t *disk, const char *home)
{
        char path[MAX_PATH_LEN], text[MAX_BUF_LEN];
        int ret;

        sprintf(path, "%s/tier/%d.tier", home, disk->idx);

        ret = _get_text(path, text, MAX_NAME_LEN);
        if (ret >= 0) {
                disk->tier = atoi(text);
                return 0;
        }

        /* _get_text() reports failures as negative error codes */
        ret = -ret;
        if (ret != ENOENT)
                GOTO(err_ret, ret);

        disk->tier = -1;

        return 0;
err_ret:
        return ret;
}

/**
 * Look up @key in a ';'-separated "key=value" record and copy its value.
 *
 * @param reserved text record, normally the 436-byte reserved area of the
 *                 super block; it does not have to be NUL-terminated
 * @param key      key including the trailing '=', e.g. "cache="
 * @param value    receives the value, or an empty string when @key is not
 *                 present; NOTE(review): its capacity is not visible here,
 *                 the caller must size it for the longest possible value
 * @return always 0
 */
int __disk_get_reserved_item(const char *reserved, const char *key, char *value)
{
        char tmp[MAX_INFO_LEN];
        char *p0 = NULL, *p;
        const char *end;
        size_t limit, len;

        YASSERT(MAX_INFO_LEN >= 436);

        /* never read more than the 436-byte reserved area and never write
         * more than tmp can hold, even if the record carries no NUL */
        limit = sizeof(tmp) - 1;
        if (limit > 436)
                limit = 436;

        end = memchr(reserved, '\0', limit);
        len = end ? (size_t)(end - reserved) : limit;
        memcpy(tmp, reserved, len);
        tmp[len] = '\0';

        p = strstr(tmp, key);
        if (p) {
                p0 = p + strlen(key);

                /* the value ends at the next ';' (or at the end of the record) */
                p = strchr(p0, ';');
                if (p) {
                        *p = '\0';
                }
        }

        if (p0)
                strcpy(value, p0);
        else
                value[0] = '\0';

        return 0;
}

/**
 * Read the cache settings recorded in the super block of @disk.
 *
 * The "cache=" item is parsed with atoi() into *cache (0 when the item is
 * absent or empty) and the "cset=" item is copied into disk->cset_uuid.
 *
 * @param disk  disk to read from; its cset_uuid is updated
 * @param cache receives the cache value
 * @return 0 on success, EIO when the super block cannot be read in full
 */
int disk_getcache(disk_t *disk, int *cache)
{
        int ret;
        struct ext4_super_block sb;
        char item[RESERVED_ITEM_MAX_LEN];
        const char *record = (const char *)sb.s_reserved;

        ret = disk->dop->io_pread(disk, (char *)&sb, sizeof(sb), SUPER_BLOCK_OFFSET);
        if (ret != sizeof(sb)) {
                ret = EIO;
                GOTO(err_ret, ret);
        }

        /* cache flag */
        memset(item, 0x0, sizeof(item));
        ret = __disk_get_reserved_item(record, "cache=", item);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        *cache = (item[0] != '\0') ? atoi(item) : 0;

        /* cache-set uuid */
        memset(item, 0x0, sizeof(item));
        ret = __disk_get_reserved_item(record, "cset=", item);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        strcpy((char *)&disk->cset_uuid, item);

        DINFO("disk %d reserved %s cache %d cset %s\n", disk->idx,
              (char *)sb.s_reserved, *cache, (char *)disk->cset_uuid);

        return 0;
err_ret:
        return ret;
}

/**
 * Report the "cached=" flag stored in the disk's super block.
 *
 * Reads the super block at SUPER_BLOCK_OFFSET through disk->dop->io_pread
 * and parses the "cached=" item of its s_reserved string.
 *
 * @param disk   disk to read from
 * @param cached receives atoi() of the item's value, or 0 when the item is
 *               absent
 * @return 0 on success, EIO when the super block could not be read in full
 */
int disk_iscached(const disk_t *disk, int *cached)
{
        int ret;
        const char *item;
        struct ext4_super_block super_block;
        /* one extra byte so the copy is always NUL-terminated */
        char reserved[sizeof(super_block.s_reserved) + 1];

        ret = disk->dop->io_pread(disk, (char *)&super_block, sizeof(super_block), SUPER_BLOCK_OFFSET);
        if (ret != sizeof(super_block)) {
                ret = EIO;
                GOTO(err_ret, ret);
        }

        /*
         * The reserved area comes straight from disk and may lack a
         * terminator, so copy exactly the field and terminate it ourselves
         * instead of trusting strcpy() to stop in time.
         */
        memcpy(reserved, super_block.s_reserved, sizeof(super_block.s_reserved));
        reserved[sizeof(super_block.s_reserved)] = '\0';

        item = strstr(reserved, "cached=");
        if (item) {
                /* atoi() stops at the ';' that ends the item */
                *cached = atoi(item + strlen("cached="));
        } else {
                *cached = 0;
        }

        return 0;
err_ret:
        return ret;
}

/**
 * Mark the disk as cached: set the first character of the "cached=" value
 * in the super block's s_reserved string to '1', write the super block back
 * to the disk, and store a copy of it in "<home>/block/<idx>.block".
 *
 * @param disk disk whose super block is updated
 * @param home base directory that holds the block/ copies
 * @return 0 on success; EIO when the super block cannot be read or written
 *         in full; ENOSPC when "cached=" sits at the very end of the
 *         reserved area and there is no room for the value; otherwise the
 *         error returned by _set_value_off()
 *
 * A super block without a "cached=" item trips YASSERT(0).
 *
 * NOTE(review): when the value is empty and directly followed by ';'
 * ("cached=;"), the ';' itself is overwritten, as before -- confirm whether
 * that layout can occur.
 */
int disk_setcached(const disk_t *disk, const char *home)
{
        int ret;
        size_t off;
        char *reserved, *p;
        struct ext4_super_block super_block;
        /* one extra byte so the search copy is always NUL-terminated */
        char tmp[sizeof(super_block.s_reserved) + 1], block[MAX_PATH_LEN];

        ret = disk->dop->io_pread(disk, (char *)&super_block, sizeof(super_block), SUPER_BLOCK_OFFSET);
        if (ret != sizeof(super_block)) {
                ret = EIO;
                GOTO(err_ret, ret);
        }

        /*
         * Search a bounded, terminated copy of the on-disk bytes, then patch
         * the original field at the same offset.
         */
        reserved = (char *)super_block.s_reserved;
        memcpy(tmp, reserved, sizeof(super_block.s_reserved));
        tmp[sizeof(super_block.s_reserved)] = '\0';

        p = strstr(tmp, "cached=");
        if (likely(p)) {
                off = (size_t)(p - tmp) + strlen("cached=");

                if (tmp[off] == '\0') {
                        /*
                         * Empty value at the end of the string: the '1'
                         * replaces the terminator, so a new terminator must
                         * fit right after it.
                         */
                        if (off + 1 >= sizeof(super_block.s_reserved)) {
                                ret = ENOSPC;
                                GOTO(err_ret, ret);
                        }

                        reserved[off + 1] = '\0';
                }

                reserved[off] = '1';

                ret = disk->dop->io_pwrite(disk, (char *)&super_block, sizeof(super_block), SUPER_BLOCK_OFFSET);
                if (ret != sizeof(super_block)) {
                        ret = EIO;
                        GOTO(err_ret, ret);
                }
        } else {
                YASSERT(0);
        }

        snprintf(block, MAX_PATH_LEN, "%s/block/%d.block", home, disk->idx);

        ret = _set_value_off(block, (void *)&super_block, sizeof(super_block), SUPER_BLOCK_OFFSET, O_CREAT | O_TRUNC);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        return 0;
err_ret:
        return ret;
}

#if 1
/**
 * Claim one free slot in the disk's bitmap.
 *
 * Asks bmap_get_empty() for an index, marks it with bmap_set() and hands it
 * back through @_idx.
 *
 * @param disk disk whose bitmap (disk->bmap) is updated
 * @param _idx receives the claimed index on success
 * @return 0 on success, otherwise the error returned by bmap_set()
 *
 * No locking is done here.
 */
STATIC int __diskmd_bmap_get_empty____(disk_t *disk, uint32_t *_idx)
{
        int ret, idx;

        DBUG("get empty normally\n");
        idx = bmap_get_empty(&disk->bmap);
        /* -1 presumably means "no free bit"; treated as a fatal condition */
        YASSERT(idx != -1);

        ret = bmap_set(&disk->bmap, idx);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        *_idx = idx;

        return 0;
err_ret:
        return ret;
}

#else

/**
 * Alternative slot allocator that probes pseudo-random bitmap positions
 * before falling back to bmap_get_empty(). This variant sits in the #else
 * branch of an "#if 1" and is therefore not compiled.
 *
 * @param disk disk whose bitmap (disk->bmap) is updated
 * @param _idx receives the claimed index on success
 * @return 0 on success, otherwise the error returned by bmap_set() in the
 *         fallback path
 */
STATIC int __diskmd_bmap_get_empty____(disk_t *disk, uint32_t *_idx)
{
        int ret, i, idx;
        uint64_t a, b;

        // TODO: probe randomly first; if no free slot is found within 100
        // tries, fall back to the normal search.
        // Random placement hurts sequential I/O, though, especially on HDDs.
        a = fastrandom();
        b = fastrandom();

        for (i = 0; i < 100; i++) {
                idx = (a + b * i) % disk->bmap.size;
                ret = bmap_set(&disk->bmap, idx);
                if (unlikely(ret)) {
                        /* slot already taken: try the next probe position */
                        YASSERT(ret == EEXIST);
                        continue;
                }

                break;
        }

        /* all 100 probes hit used slots: use the regular search instead */
        if (i == 100) {
                DBUG("get empty normally\n");
                idx = bmap_get_empty(&disk->bmap);
                YASSERT(idx != -1);

                ret = bmap_set(&disk->bmap, idx);
                if (unlikely(ret))
                        GOTO(err_ret, ret);
        }

        *_idx = idx;

        return 0;
err_ret:
        return ret;
}

#endif

/**
 * Claim @count free slots on @disk and describe them in @locs.
 *
 * Each locs[i] gets the claimed bitmap index in loc.idx, the disk's idx in
 * loc.diskid, and valid set to TRUE. Unless gloconf.bmap_mem is set, the
 * bitmap mapping is msync()ed afterwards.
 *
 * @param disk  disk to allocate from; no locking is done here
 * @param locs  array of at least @count entries to fill in
 * @param count number of slots to claim
 * @return 0 on success; ENOSPC when fewer than count + 1 bits are free;
 *         ENODEV when the disk is flagged __DISK_OFFLINE__; otherwise the
 *         error from the per-slot allocator
 *
 * NOTE(review): when a later slot fails, the slots claimed earlier in the
 * same call stay set in the bitmap.
 */
STATIC int __diskmd_bmap_get_empty__(disk_t *disk, locs_t *locs, int count)
{
        int ret, i;
        locs_t *slot;

        ANALYSIS_BEGIN(0);

        /* refuse unless more than @count bits are still free */
        if (disk->bmap.size - disk->bmap.nr_one < count + 1) {
                ret = ENOSPC;
                DERROR("disk[%u] full\n", disk->idx);
                GOTO(err_ret, ret);
        }

        if (disk->status & __DISK_OFFLINE__) {
                ret = ENODEV;
                DERROR("disk[%u] offline\n", disk->idx);
                GOTO(err_ret, ret);
        }

        for (i = 0; i < count; i++) {
                slot = &locs[i];

                ret = __diskmd_bmap_get_empty____(disk, &slot->loc.idx);
                if (unlikely(ret))
                        GOTO(err_ret, ret);

                slot->loc.diskid = disk->idx;
                slot->valid = TRUE;
        }

        /* a memory-only bitmap needs no sync */
        if (!gloconf.bmap_mem) {
                ret = msync(disk->bmap.bits, disk->map_size, MS_SYNC);
                if (ret)
                        UNIMPLEMENTED(__DUMP__);
        }

        ANALYSIS_QUEUE(0, IO_WARN, "diskmd_getempty");

        return 0;
err_ret:
        return ret;
}

/**
 * Locked wrapper around __diskmd_bmap_get_empty__(): claims @count free
 * slots on @disk while holding disk->lock for writing.
 *
 * @param disk  disk to allocate from
 * @param locs  array of at least @count entries, filled in on success
 * @param count number of slots to claim
 * @return 0 on success, otherwise the error from taking the lock or from
 *         the allocation itself
 */
int disk_bmap_get_empty(disk_t *disk, locs_t *locs, int count)
{
        int ret;

        ret = sy_rwlock_wrlock(&disk->lock);
        if (ret)
                GOTO(err_ret, ret);

        /* started after the lock is held, so lock wait time is not included */
        ANALYSIS_BEGIN(0);

        ret = __diskmd_bmap_get_empty__(disk, locs, count);
        if (ret)
                GOTO(err_lock, ret);

        ANALYSIS_QUEUE(0, IO_WARN, "disk_bmap_get_empty");

        sy_rwlock_unlock(&disk->lock);

        return 0;
err_lock:
        /* allocation failed: release the write lock before returning */
        sy_rwlock_unlock(&disk->lock);
err_ret:
        return ret;
}


/**
 * Assert that the bitmap state of a slot matches the caller's expectation.
 *
 * Reads the bit for loc->idx under the disk's read lock and YASSERTs that
 * it is set exactly when @_exist is non-zero. The result is not returned.
 *
 * @param disk   disk whose bitmap is consulted
 * @param loc    location whose idx selects the bit
 * @param _exist expected state: non-zero for "in use", zero for "free"
 * @return 0 on success, otherwise the error from taking the read lock
 */
int disk_exists(disk_t *disk, const diskloc_t *loc, int _exist)
{
        int ret;
        int exist;

        ret = sy_rwlock_rdlock(&disk->lock);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        /* sample the bit under the lock; compare after releasing it */
        exist = bmap_get(&disk->bmap, loc->idx);
        sy_rwlock_unlock(&disk->lock);

        YASSERT(!exist == !_exist);

        return 0;
err_ret:
        return ret;
}

/**
 * Queue a delete request for the given location.
 *
 * Allocates a delete_request_t holding a copy of *loc, appends it to
 * disk->delete_list under disk->delete_lock, and posts disk->sem.
 * The deletion itself is not performed here.
 *
 * @param disk disk that owns the delete queue
 * @param loc  location to delete; copied, so the caller keeps ownership
 * @return 0 on success, otherwise the error from ymalloc() or from taking
 *         the spin lock (the request is freed in the latter case)
 */
int disk_delete(disk_t *disk, const diskloc_t *loc)
{
        int ret;
        delete_request_t *req;

        ret = ymalloc((void **)&req, sizeof(*req));
        if (unlikely(ret))
                GOTO(err_ret, ret);

        req->loc = *loc;

        ret = sy_spin_lock(&disk->delete_lock);
        if (unlikely(ret))
                GOTO(err_free, ret);

        list_add_tail(&req->hook, &disk->delete_list);

        sy_spin_unlock(&disk->delete_lock);

        /* presumably wakes whoever drains delete_list -- verify against the consumer */
        sem_post(disk->sem);

        return 0;
err_free:
        yfree((void **)&req);
err_ret:
        return ret;
}

/**
 * Persist a disk's extended info record to "<home>/info/<idx>.extinfo".
 *
 * @param home         base directory that holds the info/ files
 * @param idx          disk index used to build the file name
 * @param disk_extinfo record to store; written as raw bytes
 * @return 0 on success, otherwise the error returned by _set_value()
 */
int disk_extinfo_set(const char *home, int idx, const disk_extinfo_t *disk_extinfo)
{
        int ret;
        char path[MAX_PATH_LEN];

        snprintf(path, MAX_PATH_LEN, "%s/info/%d.extinfo", home, idx);

        /*
         * Write the whole record: sizeof(*disk_extinfo), not the size of the
         * pointer, which would store only its first few bytes.
         */
        ret = _set_value(path, (void *)disk_extinfo, sizeof(*disk_extinfo), O_CREAT | O_TRUNC);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        return 0;
err_ret:
        return ret;
}

/**
 * Load a disk's extended info record from "<home>/info/<idx>.extinfo".
 *
 * @param home         base directory that holds the info/ files
 * @param idx          disk index used to build the file name
 * @param disk_extinfo receives the record; bytes the file does not provide
 *                     are zero
 * @return 0 on success, otherwise the (positive) error reported by
 *         _get_value()
 */
int disk_extinfo_get(const char *home, int idx, disk_extinfo_t *disk_extinfo)
{
        int ret;
        char path[MAX_PATH_LEN];
        char buf[MAX_BUF_LEN];

        /* the record is copied out of buf below, so it has to fit */
        YASSERT(sizeof(*disk_extinfo) <= MAX_BUF_LEN);

        snprintf(path, MAX_PATH_LEN, "%s/info/%d.extinfo", home, idx);

        /*
         * Zero the staging buffer first: if the file is shorter than the
         * record, the missing tail then reads as zeros instead of whatever
         * happened to be on the stack.
         */
        memset(buf, 0x0, sizeof(buf));

        ret = _get_value(path, buf, MAX_BUF_LEN);
        if (unlikely(ret < 0)) {
                ret = -ret;
                GOTO(err_ret, ret);
        }

        memcpy(disk_extinfo, buf, sizeof(*disk_extinfo));

        return 0;
err_ret:
        return ret;
}

/**
 * Remove the extended info file "<home>/info/<idx>.extinfo".
 *
 * @param home base directory that holds the info/ files
 * @param idx  disk index used to build the file name
 *
 * The result of unlink() is not checked.
 */
void disk_extinfo_remove(const char *home, int idx)
{
        char path[MAX_PATH_LEN];

        snprintf(path, sizeof(path), "%s/info/%d.extinfo", home, idx);

        (void) unlink(path);
}
